In [1]:
import pandas as pd
import numpy as np
breast_cancer_data = pd.read_csv('data/breast-cancer-wisconsin.tsv.gz',
                                 sep='\t',
                                 compression='gzip')
In [2]:
from collections import Counter
Counter(breast_cancer_data['class'].values)
Out[2]:
In [3]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
cross_val_score(RandomForestClassifier(n_estimators=100, n_jobs=-1),
                breast_cancer_data.drop('class', axis=1).values,
                breast_cancer_data.loc[:, 'class'].values,
                cv=StratifiedKFold(n_splits=5, shuffle=True))
Out[3]:
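In [ ]:
# Optional follow-up, not part of the original run: summarize the per-fold
# scores above as a mean and standard deviation instead of a raw array.
scores = cross_val_score(RandomForestClassifier(n_estimators=100, n_jobs=-1),
                         breast_cancer_data.drop('class', axis=1).values,
                         breast_cancer_data['class'].values,
                         cv=StratifiedKFold(n_splits=5, shuffle=True))
print('CV accuracy: {:0.3f} +/- {:0.3f}'.format(np.mean(scores), np.std(scores)))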
In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data.drop('class', axis=1).values,
                                                    breast_cancer_data['class'].values,
                                                    stratify=breast_cancer_data['class'].values,
                                                    train_size=0.75, test_size=0.25)
clf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
clf.fit(X_train, y_train)
plt.figure(figsize=(12, 7))
sb.swarmplot(x=y_train, y=clf.predict(X_train))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Actual status', fontsize=14)
plt.ylabel('Predicted probability', fontsize=14)
plt.ylim(-0.01, 1.01)
;
Out[4]: [swarm plot of predicted probability vs. actual status on the training set]
In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data.drop('class', axis=1).values,
                                                    breast_cancer_data['class'].values,
                                                    stratify=breast_cancer_data['class'].values,
                                                    train_size=0.75, test_size=0.25)
clf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
clf.fit(X_train, y_train)
plt.figure(figsize=(12, 7))
sb.swarmplot(x=y_test, y=clf.predict(X_test))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Actual status', fontsize=14)
plt.ylabel('Predicted probability', fontsize=14)
plt.ylim(-0.01, 1.01)
;
Out[5]: [swarm plot of predicted probability vs. actual status on the held-out test set]
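In [ ]:
# Hedged sketch, not in the original notebook: turn the regressor's output into
# hard labels by thresholding at 0.5 and score it on the held-out split. This
# assumes the 'class' column is coded 0/1, as the 0-1 "predicted probability"
# axis above suggests; the 0.5 cutoff is an arbitrary choice for illustration.
predicted_labels = (clf.predict(X_test) >= 0.5).astype(int)
print('Thresholded held-out accuracy: {:0.3f}'.format(np.mean(predicted_labels == y_test)))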
In [6]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, make_union
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score, StratifiedKFold
breast_cancer_data = pd.read_csv('data/breast-cancer-wisconsin.tsv.gz',
                                 sep='\t',
                                 compression='gzip')
all_features = breast_cancer_data.drop('class', axis=1).values
all_classes = breast_cancer_data['class'].values
# Build the "crowd machine": pass the raw features through (SelectKBest, k='all')
# and append predictions from random forest regressors with increasing
# min_weight_fraction_leaf, each wrapped in a one-estimator VotingClassifier so
# that it exposes transform() for the FeatureUnion.
union_ops = [SelectKBest(k='all')]

for i, mwfl in enumerate(np.arange(0., 0.21, 0.01)):
    union_ops.append(VotingClassifier(estimators=[('rf-mwfl={}'.format(mwfl),
                                                   RandomForestRegressor(n_estimators=100,
                                                                         n_jobs=-1,
                                                                         min_weight_fraction_leaf=mwfl))]))
    # Baseline: a plain random forest classifier at the same leaf setting
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, min_weight_fraction_leaf=mwfl)
    print('RF w/ mwfl={:0.2f} CV score: {:0.3f}'.format(
        mwfl,
        np.mean(cross_val_score(clf, all_features, all_classes,
                                cv=StratifiedKFold(n_splits=5, shuffle=True)))))

# Union of raw features and sub-model predictions, fed to a final random forest
clf = make_pipeline(make_union(*union_ops), RandomForestClassifier(n_estimators=100, n_jobs=-1))
print('Crowd machine CV score: {:0.3f}'.format(
    np.mean(cross_val_score(clf, all_features, all_classes,
                            cv=StratifiedKFold(n_splits=5, shuffle=True)))))
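In [ ]:
# Hedged sketch, not in the original notebook: peek at what the crowd machine's
# FeatureUnion hands to the final random forest. SelectKBest(k='all') passes the
# raw features through unchanged, and each single-estimator VotingClassifier
# exposes its wrapped forest's predictions via transform(), so the meta-classifier
# sees the original columns plus one prediction column per sub-model. (Newer
# scikit-learn releases may reject non-classifier estimators inside
# VotingClassifier, in which case this cell, like the one above, needs an older
# version.)
union = make_union(*union_ops)
union_features = union.fit_transform(all_features, all_classes)
print('Original feature matrix:', all_features.shape)
print('Crowd machine feature matrix:', union_features.shape)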
In [7]:
import pandas as pd
spambase_data = pd.read_csv('data/spambase.tsv.gz',
                            sep='\t',
                            compression='gzip')
In [8]:
from collections import Counter
Counter(spambase_data['class'].values)
Out[8]:
In [9]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
cross_val_score(RandomForestClassifier(n_estimators=100, n_jobs=-1),
                spambase_data.drop('class', axis=1).values,
                spambase_data.loc[:, 'class'].values,
                cv=StratifiedKFold(n_splits=5, shuffle=True))
Out[9]:
In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(spambase_data.drop('class', axis=1).values,
                                                    spambase_data['class'].values,
                                                    stratify=spambase_data['class'].values,
                                                    train_size=0.75, test_size=0.25)
clf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
clf.fit(X_train, y_train)
plt.figure(figsize=(12, 7))
sb.boxplot(x=y_train, y=clf.predict(X_train))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Actual status', fontsize=14)
plt.ylabel('Predicted probability', fontsize=14)
plt.ylim(-0.01, 1.01)
;
Out[10]: [box plot of predicted probability vs. actual status on the training set]
In [11]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(spambase_data.drop('class', axis=1).values,
                                                    spambase_data['class'].values,
                                                    stratify=spambase_data['class'].values,
                                                    train_size=0.75, test_size=0.25)
clf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
clf.fit(X_train, y_train)
plt.figure(figsize=(12, 7))
sb.boxplot(x=y_test, y=clf.predict(X_test))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Actual status', fontsize=14)
plt.ylabel('Predicted probability', fontsize=14)
plt.ylim(-0.01, 1.01)
;
Out[11]: [box plot of predicted probability vs. actual status on the held-out test set]
In [12]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, make_union
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score, StratifiedKFold
spambase_data = pd.read_csv('data/spambase.tsv.gz',
                            sep='\t',
                            compression='gzip')
all_features = spambase_data.drop('class', axis=1).values
all_classes = spambase_data['class'].values
# Same crowd-machine construction as above, now on the spambase data.
union_ops = [SelectKBest(k='all')]

for i, mwfl in enumerate(np.arange(0., 0.21, 0.01)):
    union_ops.append(VotingClassifier(estimators=[('rf-mwfl={}'.format(mwfl),
                                                   RandomForestRegressor(n_estimators=100,
                                                                         n_jobs=-1,
                                                                         min_weight_fraction_leaf=mwfl))]))
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, min_weight_fraction_leaf=mwfl)
    print('RF w/ mwfl={:0.2f} CV score: {:0.3f}'.format(
        mwfl,
        np.mean(cross_val_score(clf, all_features, all_classes,
                                cv=StratifiedKFold(n_splits=5, shuffle=True)))))

clf = make_pipeline(make_union(*union_ops), RandomForestClassifier(n_estimators=100, n_jobs=-1))
print('Crowd machine CV score: {:0.3f}'.format(
    np.mean(cross_val_score(clf, all_features, all_classes,
                            cv=StratifiedKFold(n_splits=5, shuffle=True)))))
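In [ ]:
# Hedged sketch, not in the original notebook: a single stratified held-out split
# as a complement to the cross-validation above, comparing the crowd machine with
# a plain random forest on the spambase data. The 75/25 split mirrors the earlier
# cells and is otherwise an arbitrary choice.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(all_features, all_classes,
                                                    stratify=all_classes,
                                                    train_size=0.75, test_size=0.25)
plain_rf = RandomForestClassifier(n_estimators=100, n_jobs=-1).fit(X_train, y_train)
crowd_machine = make_pipeline(make_union(*union_ops),
                              RandomForestClassifier(n_estimators=100, n_jobs=-1)).fit(X_train, y_train)
print('Plain RF held-out accuracy: {:0.3f}'.format(plain_rf.score(X_test, y_test)))
print('Crowd machine held-out accuracy: {:0.3f}'.format(crowd_machine.score(X_test, y_test)))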